ERCOT-Full Dataset¶

In [1]:
#importing necessary libraries
import seaborn as sns
color_pal = sns.color_palette()
from matplotlib import pyplot as plt
plt.style.use('fivethirtyeight')
import pandas as pd 
import numpy as np
from datetime import datetime, timedelta
import xgboost as xgb
from sklearn.metrics import mean_squared_error
# NOTE(review): duplicate — `datetime` (the class) was already imported above
from datetime import datetime
# NOTE(review): this rebinds the name `datetime` from the class to the module,
# shadowing both `from datetime import datetime` lines. The notebook only uses
# `timedelta` (still bound) and `pd.to_datetime`, so it is harmless here, but any
# later `datetime(...)` call would fail — worth consolidating to one import.
import datetime
from sklearn.model_selection import TimeSeriesSplit
import plotly.io as pio
pio.renderers.default = 'notebook'
import plotly.graph_objects as go
import scipy.stats as stats

1. Loading/Cleaning Weather Data¶

  • (1.1) reading weather file from: Jan 2015- Jan 2017
  • (1.2) adding in data from: Jan 2017- Jan 2023
  • (1.3) adding in data from: Jan 2023-Present
  • (1.4) cleaning up weather data: removing null values, converting date and setting to index in DateTime format
In [2]:
#(1.1) reading weather file from: Jan 2015- Jan 2017
# collect all yearly frames first and concatenate once — growing a DataFrame
# with pd.concat inside a loop is quadratic in the number of rows
weather_frames = [pd.read_csv("./Weather_Data/2015-2017.csv")]

#(1.2) adding in data from: Jan 2017- Jan 2023 (two-year files)
for year in range(2017, 2023, 2):
    weather_frames.append(pd.read_csv(f"./Weather_Data/{year}-{year + 2}.csv"))

#(1.3) adding in data from: Jan 2023-Present
weather_frames.append(pd.read_csv("./Weather_Data/2023.csv"))
weatherdf = pd.concat(weather_frames, axis=0)

#(1.4) cleaning up weather data: removing null values, converting date and setting to index in DateTime format
weatherdf['Hour'] = weatherdf['Hour'].astype(str)
weatherdf["Time"] = pd.to_datetime(weatherdf["Date"] + " " + weatherdf["Hour"] + ":00")
weatherdf = weatherdf.set_index('Time')
weatherdf = weatherdf.drop(columns=["Date", "Hour"])
# -99 appears to be the data source's missing-temperature sentinel — TODO confirm
weatherdf = weatherdf[weatherdf["Temp"] != -99]

#displaying summary of weather data
weatherdf.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 74556 entries, 2015-01-01 00:00:00 to 2023-06-30 23:00:00
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Temp              74556 non-null  int64
 1   Dew Point         74556 non-null  int64
 2   Heat Index        74556 non-null  int64
 3   Wind Speed (mph)  74556 non-null  int64
 4   RH                74556 non-null  int64
dtypes: int64(5)
memory usage: 3.4 MB

2. Loading/Cleaning Load Data¶

  • (2.1) reading native load file from: Jan 2015- Jan 2016
  • (2.2) adding in data from: Jan 2016- Present https://www.ercot.com/gridinfo/load/load_hist
  • (2.3) first load data clean-up: removing null values, rounding values
  • (2.4) handling daylight savings times (delete manually in files)
  • (2.5) remove other errors in time such as leading zeros/other inconsistencies, rounding hours
In [3]:
#(2.1)-(2.2) reading native load files: Jan 2015 - Present
# source: https://www.ercot.com/gridinfo/load/load_hist
# read every yearly file, then concatenate once
load_frames = [pd.read_csv(f"./loads/Native_Load_{year}.csv") for year in range(2015, 2024)]
loadf = pd.concat(load_frames, axis=0)

#(2.3) cleaning up load data: removing null values, rounding values
loadf.info()
loadf = loadf.reset_index(drop=True)
loadf = loadf.dropna()
# strip thousands separators so the MW columns parse as numbers
loadf = loadf.replace(',', '', regex=True)
# every column except the first ("Hour Ending") holds load values
for column in loadf.columns[1:]:
    loadf[column] = pd.to_numeric(loadf[column], errors='coerce').round(2)

#(2.4) handling daylight savings times (delete manually in files)
#loadf.index[loadf['Hour Ending'] == '11/5/2017 02:00 DST']
#Ex: 11/01/2020 02:00 DST, 11/07/2021 02:00 DST, 11/06/2022 02:00 DST

#(2.5) normalizing the "Hour Ending" strings so they all parse as '%m/%d/%y %H:%M'
listOfHours = loadf["Hour Ending"].tolist()

#going through each time individually and checking for/fixing bugs
# (loop variable renamed from `list`, which shadowed the builtin)
for i in range(len(listOfHours)):

    # midnight entries in the 2023 file are truncated (fewer than 6 characters);
    # rebuild them from the previous, already-normalized timestamp + 59 minutes —
    # the later .round('h') snaps this to the correct hour
    # NOTE(review): assumes the first entry is never truncated (i-1 would wrap to -1)
    if len(listOfHours[i]) < 6:
        date2 = pd.to_datetime(listOfHours[i - 1], format='%m/%d/%y %H:%M') + timedelta(minutes=59)
        listOfHours[i] = date2.strftime('%m/%d/%y %H:%M')

    # strip a leading zero from the month
    if listOfHours[i][0] == '0':
        listOfHours[i] = listOfHours[i][1:]

    # strip a leading zero from the day
    if '/0' in listOfHours[i]:
        listOfHours[i] = listOfHours[i].replace("/0", "/")

    # abbreviate four-digit years
    for full_year, short_year in (('/2021', '/21'), ('/2022', '/22'), ('/2023', '/23')):
        if full_year in listOfHours[i]:
            listOfHours[i] = listOfHours[i].replace(full_year, short_year)

    # changing midnight to be consistent (0:00 not 24:00)
    if '24:00' in listOfHours[i]:
        listOfHours[i] = listOfHours[i].replace(" 24:00", " 0:00")

    # (removed) the original "any other trailing zeros" branch could never fire:
    # it required ' 0' to be absent and then replaced ' 0' — a no-op


#updating times and setting to index in DateTime format
loadf["Hour Ending"] = listOfHours
loadf = loadf.reset_index(drop=True)
loadf["Hour Ending"] = loadf["Hour Ending"].apply(lambda x: pd.to_datetime(x, format='%m/%d/%y %H:%M').round('h'))
# (removed a stray pd.to_datetime(...) call whose result was discarded — a no-op)
loadf = loadf.set_index("Hour Ending").rename_axis('Time', axis='index')

#displaying summary of load data

#displaying summary of load data
<class 'pandas.core.frame.DataFrame'>
Int64Index: 74464 entries, 0 to 4341
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Hour Ending  74464 non-null  object
 1   COAST        74463 non-null  object
 2   EAST         74463 non-null  object
 3   FWEST        74463 non-null  object
 4   NORTH        74463 non-null  object
 5   NCENT        74463 non-null  object
 6   SOUTH        74463 non-null  object
 7   SCENT        74463 non-null  object
 8   WEST         74463 non-null  object
 9   ERCOT        74463 non-null  object
dtypes: object(10)
memory usage: 6.2+ MB

3. Merging and Saving Datasets¶

  • (3.1) merging data and saving to "ERCOT-Full_Data.csv"
In [4]:
#(3.1) merging data and saving to "ERCOT-Full_Data.csv"
# inner join on the DatetimeIndex keeps only hours present in both datasets
finaldata = pd.merge(weatherdf, loadf["ERCOT"], left_index=True, right_index=True)
#dropping null values
finaldata = finaldata.dropna()
#naming the index — the original discarded rename_axis's return value (a no-op);
# assign it so the rename actually takes effect
finaldata = finaldata.rename_axis('Time', axis='index')
finaldata.to_csv('ERCOT-Full_Data.csv')

4. Setting up the Model¶

  • (4.1) re-reading data
  • (4.2) defining characteristics
  • (4.3) adding time lags characteristic
In [5]:
#(4.1) re-reading data from the merged CSV produced above
df = pd.read_csv("./ERCOT-Full_Data.csv")
df["Time"] = pd.to_datetime(df["Time"])
df = df.set_index("Time")
# assign the result — the original discarded rename_axis's return value (a no-op)
df = df.rename_axis('DATE', axis='index')

#(4.2) defining characteristics
def create_features(df):
    """
    Return a copy of ``df`` augmented with calendar columns derived from its
    DatetimeIndex (hour, dayofweek, quarter, month, year, dayofyear,
    dayofmonth, weekofyear). The input frame is not modified.
    """
    out = df.copy()
    idx = out.index
    calendar_cols = {
        'hour': idx.hour,
        'dayofweek': idx.dayofweek,
        'quarter': idx.quarter,
        'month': idx.month,
        'year': idx.year,
        'dayofyear': idx.dayofyear,
        'dayofmonth': idx.day,
        'weekofyear': idx.isocalendar().week,
    }
    for name, values in calendar_cols.items():
        out[name] = values
    return out
#adding the calendar feature columns to the full dataset
df = create_features(df)

#(4.3) adding time lags characteristic
def add_lags(df):
    """
    Add lag1/lag2/lag3 columns holding the ERCOT load from 364, 728, and 1092
    days earlier (multiples of 364 keep the day-of-week aligned). Timestamps
    with no matching earlier row become NaN. Mutates ``df`` and returns it.
    """
    target_map = df['ERCOT'].to_dict()
    for lag_num, offset in enumerate(('364 days', '728 days', '1092 days'), start=1):
        df[f'lag{lag_num}'] = (df.index - pd.Timedelta(offset)).map(target_map)
    return df
#adding the lag columns to the full dataset (add_lags mutates and returns df)
df = add_lags(df)

Plots¶

In [6]:
#displaying correlations with a heatmap
# (the bare expression's AxesSubplot return value is the cell's Out, rendering the plot)
sns.heatmap(df.corr())
Out[6]:
<AxesSubplot:>
No description has been provided for this image
In [7]:
#distribution of ERCOT load (MW) for each hour of the day
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(data=df, x='hour', y='ERCOT', palette='rocket', ax=ax)
ax.set_title('MW by Hour')
plt.show()
No description has been provided for this image
In [8]:
#distribution of ERCOT load (MW) for each month of the year
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(data=df, x='month', y='ERCOT', palette='crest', ax=ax)
ax.set_title('MW by Month')
plt.show()
No description has been provided for this image

5. Running the Model¶

  • (5.1) splitting data to train and test
  • (5.2) splitting to train and test, adding time/refining characteristics
  • (5.3) defining xgboost regressor function imported from library
  • (5.4) fitting the regression
  • (5.5) interpreting and saving output from fold
In [9]:
#(5.1) splitting data into 5 sequential CV folds
# test_size=24*365 -> each validation fold is one year of hourly rows;
# gap=24 leaves a one-day gap between the train and validation windows
tss = TimeSeriesSplit(n_splits=5, test_size=24*365*1, gap=24)
df = df.sort_index()

fold = 0  # NOTE(review): never incremented or read — candidate for removal
preds = []
tests = []
scores = []
dates = []

#looping through each split
for train_idx, val_idx in tss.split(df):
    
    #(5.2) splitting to train and test, adding time/refining characteristics
    train = df.iloc[train_idx]
    test = df.iloc[val_idx]
    # NOTE(review): df already went through create_features above, so these
    # calls recompute existing columns — redundant but harmless
    train = create_features(train)
    test = create_features(test)
    
    # calendar + weather + lag columns used as model inputs; target is total ERCOT load
    FEATURES = ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month','year','Temp', 'Dew Point', 'Heat Index', 'Wind Speed (mph)', 'RH', 'lag1', 'lag2', 'lag3']
    TARGET = 'ERCOT'

    X_train = train[FEATURES]
    y_train = train[TARGET]
    X_test = test[FEATURES]
    y_test = test[TARGET]
    dates.append(X_test.index)

    #(5.3) defining xgboost regressor imported from library
    # early_stopping_rounds=50 halts training once the last eval_set entry
    # (the validation fold) stops improving for 50 rounds
    reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree',    
                           n_estimators=1000,
                           early_stopping_rounds=50,
                           objective='reg:squarederror',
                           max_depth=3,
                           learning_rate=0.01)
    
    #(5.4) fitting the regression (logs train/validation RMSE every 100 rounds)
    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=100)
    
    #(5.5) interpreting and saving output from fold
    y_pred = reg.predict(X_test)
    tests.append(y_test)
    preds.append(y_pred)
    # RMSE of this fold on its held-out year
    score = np.sqrt(mean_squared_error(y_test, y_pred))
    scores.append(score)
[0]	validation_0-rmse:40992.96724	validation_1-rmse:43167.60046
[100]	validation_0-rmse:15403.39421	validation_1-rmse:17430.72870
[200]	validation_0-rmse:6265.62196	validation_1-rmse:7792.90386
[300]	validation_0-rmse:3256.90206	validation_1-rmse:4188.86569
[400]	validation_0-rmse:2383.43343	validation_1-rmse:2777.52292
[500]	validation_0-rmse:2099.05960	validation_1-rmse:2285.79443
[600]	validation_0-rmse:1968.81538	validation_1-rmse:2099.68923
[700]	validation_0-rmse:1892.74437	validation_1-rmse:2005.35838
[800]	validation_0-rmse:1834.74048	validation_1-rmse:1945.35235
[900]	validation_0-rmse:1787.39542	validation_1-rmse:1907.37107
[999]	validation_0-rmse:1747.59078	validation_1-rmse:1882.96587
[0]	validation_0-rmse:41483.80374	validation_1-rmse:44487.98210
[100]	validation_0-rmse:15593.53524	validation_1-rmse:17661.01302
[200]	validation_0-rmse:6337.64085	validation_1-rmse:7720.97180
[300]	validation_0-rmse:3282.91072	validation_1-rmse:4164.97081
[400]	validation_0-rmse:2409.40580	validation_1-rmse:2952.37406
[500]	validation_0-rmse:2135.70714	validation_1-rmse:2549.31529
[600]	validation_0-rmse:2008.15912	validation_1-rmse:2375.91870
[700]	validation_0-rmse:1929.85562	validation_1-rmse:2284.38968
[800]	validation_0-rmse:1872.13117	validation_1-rmse:2228.53187
[900]	validation_0-rmse:1823.04984	validation_1-rmse:2191.59841
[999]	validation_0-rmse:1782.85417	validation_1-rmse:2162.00461
[0]	validation_0-rmse:42039.84476	validation_1-rmse:44765.59073
[100]	validation_0-rmse:15806.75072	validation_1-rmse:17977.56916
[200]	validation_0-rmse:6440.25841	validation_1-rmse:7996.06568
[300]	validation_0-rmse:3369.16198	validation_1-rmse:4436.98635
[400]	validation_0-rmse:2497.30242	validation_1-rmse:3292.11203
[500]	validation_0-rmse:2226.43945	validation_1-rmse:2932.50650
[600]	validation_0-rmse:2093.95779	validation_1-rmse:2790.43088
[700]	validation_0-rmse:2011.74709	validation_1-rmse:2731.90832
[800]	validation_0-rmse:1953.26799	validation_1-rmse:2699.67946
[900]	validation_0-rmse:1901.43759	validation_1-rmse:2673.18317
[999]	validation_0-rmse:1857.52879	validation_1-rmse:2658.05257
[0]	validation_0-rmse:42467.65058	validation_1-rmse:47924.67780
[100]	validation_0-rmse:15976.97048	validation_1-rmse:20291.80872
[200]	validation_0-rmse:6512.80411	validation_1-rmse:10050.41793
[300]	validation_0-rmse:3435.83783	validation_1-rmse:6244.60204
[400]	validation_0-rmse:2566.78371	validation_1-rmse:4855.50602
[500]	validation_0-rmse:2301.16963	validation_1-rmse:4189.71192
[600]	validation_0-rmse:2182.17309	validation_1-rmse:3915.02671
[700]	validation_0-rmse:2106.11622	validation_1-rmse:3766.12633
[800]	validation_0-rmse:2048.87763	validation_1-rmse:3694.68887
[900]	validation_0-rmse:1998.30372	validation_1-rmse:3601.00688
[999]	validation_0-rmse:1956.07804	validation_1-rmse:3534.29333
[0]	validation_0-rmse:43227.28666	validation_1-rmse:49375.15567
[100]	validation_0-rmse:16296.42096	validation_1-rmse:20874.38370
[200]	validation_0-rmse:6698.35776	validation_1-rmse:9857.74442
[300]	validation_0-rmse:3581.44005	validation_1-rmse:5306.01925
[400]	validation_0-rmse:2690.67192	validation_1-rmse:3589.58860
[500]	validation_0-rmse:2411.77092	validation_1-rmse:3005.81901
[600]	validation_0-rmse:2281.39950	validation_1-rmse:2756.06634
[700]	validation_0-rmse:2191.46724	validation_1-rmse:2626.17951
[800]	validation_0-rmse:2120.88125	validation_1-rmse:2539.85919
[900]	validation_0-rmse:2064.50817	validation_1-rmse:2484.17576
[999]	validation_0-rmse:2021.03117	validation_1-rmse:2452.50386

6. Analyzing Results¶

In [10]:
#making csv tracking predictions for each hour and actual results
# flatten the per-fold lists in one call — works for any number of CV folds,
# not just the hard-coded 5 of the original
analysis = pd.DataFrame({
    'Date': np.concatenate(dates),
    'Prediction': np.concatenate(preds),
    'Actual': np.concatenate([fold_y.values for fold_y in tests]),
})
#absolute percentage error per hour
analysis["% Error"] = ((analysis["Prediction"] - analysis["Actual"]) / analysis["Actual"]).abs() * 100
analysis.to_csv('ERCOT-Full_Results.csv')

print("% Error Statistics:")

#printing error metrics, box-plot
print(analysis["% Error"].describe())
sns.boxplot(x=analysis["% Error"])

#calculating and displaying rmse (mean of the per-fold RMSEs, not a pooled RMSE)
print(f'Model RMSE: {np.mean(scores):0.4f}')
% Error Statistics:
count    43800.000000
mean         4.287299
std          3.508268
min          0.000130
25%          1.623331
50%          3.461501
75%          6.097114
max         41.689184
Name: % Error, dtype: float64
Model RMSE: 2537.8785
No description has been provided for this image
In [11]:
#displaying each factor's importance (reg is the last fold's fitted model)
importances = pd.DataFrame(
    {'importance': reg.feature_importances_},
    index=reg.feature_names_in_,
)
importances.sort_values('importance').plot(kind='barh', title='Feature Importance')
plt.show()


#scatter and line chart to trace difference in predictions and actual
first_rows = analysis.head(1000)
fig = go.Figure()
fig.update_layout(title="Prediction vs Actual Scatter")
fig.add_trace(go.Scatter(x=first_rows.index, y=first_rows["Prediction"],
                    mode='markers',
                    name='Prediction'))
fig.add_trace(go.Scatter(x=first_rows.index, y=first_rows["Actual"],
                    mode='lines',
                    name='Actual'))
fig.show()

#histogram to trace difference in predictions and actual
fig = go.Figure()
fig.add_trace(go.Histogram(x=analysis["Prediction"], name='Prediction'))
fig.add_trace(go.Histogram(x=analysis["Actual"], name='Actual'))
fig.update_layout(title="Histogram of Model Results")
fig.show()
No description has been provided for this image

7. Predicting the Future: July 2023¶

  • (7.1) define future dataframe (set date)
  • (7.2) adding future to full-dataset, making copy, adding all factors
    • set features for forecast
  • (7.3) making predictions for the future, plotting and saving them
In [12]:
#(7.1) define future dataframe: every hour of July 2023
future = pd.date_range('2023-07-01', '23:00 2023-07-31', freq='1h')
future_df = pd.DataFrame(index=future)

#(7.2) adding future to full-dataset, making copy, adding all factors
future_df['isFuture'] = True
df['isFuture'] = False
df_and_future = pd.concat([df, future_df])
df_and_future = create_features(df_and_future)
df_and_future = add_lags(df_and_future)
future_w_features = df_and_future.query('isFuture').copy()

# FIXME(review): the weather columns (Temp, Dew Point, Heat Index, Wind Speed,
# RH) are NaN for all future rows. The original cell only *read* these columns
# and discarded the result (a no-op), so no forecast weather was ever set.
# XGBoost tolerates NaN inputs, but these predictions use no real weather
# forecast — fill the weather columns before trusting the output.

#(7.3) making predictions for the future, plotting and saving them
future_w_features['Prediction'] = reg.predict(future_w_features[FEATURES])
future_w_features['Prediction'].plot(figsize=(10, 5),
                               color=color_pal[4],
                               ms=1,
                               lw=1,
                               title='Future Predictions')
#saving to a Prediction csv
future_w_features['Prediction'].to_csv('ERCOT-Full_Predictions.csv')

#saving to a full csv: future predictions appended below the historical results
source = pd.DataFrame(columns=['Prediction', 'Actual', '% Error'])
source["Prediction"] = future_w_features['Prediction']
source = source.reset_index()
source = source.fillna(0)
source.rename(columns = {'index':'Date'}, inplace = True)
pred_and_past = pd.concat([analysis, source], axis=0)
pred_and_past.to_csv('ERCOT-Full_Past-Predictions.csv')

#showing plot of predictions
plt.show()
No description has been provided for this image